This document demonstrates the use of the bRF and LASSO-D3S functions for integrative GRN inference.

Those functions infer the regulatory pathways of Arabidopsis thaliana’s roots in response to nitrate (N) induction from Varala et al., 2018.

They use as inputs the expression profiles of N-responsive genes and TFBS information. Prior TFBS information was built by searching in the promoters of the N-responsive genes the PWM of the N-responsive regulators.

Data import

Expression data

load('rdata/inference_input_N_response_varala.rdata')
genes <- input_data$grouped_genes; length(genes)
## [1] 1426
tfs <- input_data$grouped_regressors; length(tfs)
## [1] 201
counts <- input_data$counts; dim(counts)
## [1] 1426   45

TFBS data

load("rdata/pwm_occurrences_N_response_varala.rdata")
dim(pwm_occurrence)
## [1] 1426  201

GRN inference

How does the individual MSE of genes varies with \(\alpha\) for model estimation in GRN inferences?

bRf : biased Random Forests

# ALPHAS <- seq(0,1, by = 0.1)
# lmses <- data.frame(row.names = genes)
# 
# for(alpha in ALPHAS){
#   set.seed(121314)
#   lmses[,as.character(alpha)] <- bRF_inference_MSE(counts, genes, tfs, alpha = alpha, nTrees = 1000,
#                              pwm_occurrence = pwm_occurrence, nCores = 18)
# }
# 
# save(lmses, file = "results/logMSES_RFs_per_genes.rdata")

Clustering the genes :

load(file = "results/logMSES_RFs_per_genes.rdata")

mse <- (lmses-rowMeans(lmses)) / matrixStats::rowSds(as.matrix(lmses))

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
    annotation_name_side = "left")



heatmap_rf <- Heatmap(mse, row_km = 3, cluster_columns = F, show_row_names = F, 
        name = "scaled logMSE (z-score)", top_annotation = ha,
        row_title = "Genes", column_title = "alpha"); heatmap_rf

LASSO-D3S

# lmses_lasso <- data.frame(row.names = genes)
# 
# for(alpha in ALPHAS){
#   set.seed(121314)
#   lmses_lasso[,as.character(alpha)] <- LASSO.D3S_inference_MSE(counts, genes, tfs,
#                                                                normalized = TRUE, 
#                                                                alpha = alpha, N = 50,
#                                pwm_occurrence = pwm_occurrence, nCores = 15)
# }
# 
# save(lmses_lasso, file = "results/logMSES_Lasso_per_genes.rdata")
load(file = "results/logMSES_Lasso_per_genes.rdata")
mse_lasso <- (lmses_lasso-rowMeans(lmses_lasso)) / matrixStats::rowSds(as.matrix(lmses_lasso))

ha = HeatmapAnnotation(
    alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
    annotation_name_side = "left")


heatmap_lasso <- Heatmap(mse_lasso, row_km = 3, cluster_columns = F, show_row_names = F, 
        name = "scaled logMSE (z-score)", top_annotation = ha,
        row_title = "Genes", column_title = "alpha"); heatmap_lasso

Clustering global des gènes suivant alpha et les deux méthodes

Il semblerait que les gènes dont la MSE diminue ne soient pas les mêmes suivant bRF et LASSO, seul un petit sous ensemble est commun:

hall = HeatmapAnnotation(
  method = c(rep("bRF", 11), rep("lasso", 11)),
    alpha = anno_simple(as.numeric(rep(colnames(mse),2))),
    annotation_name_side = "left")

Heatmap(cbind(mse, mse_lasso), row_km = 6, cluster_columns = F, show_row_names = F, 
        name = "scaled logMSE", top_annotation = hall, clustering_distance_rows = "spearman",
        row_title = "Genes")

Y-a-til des gènes qu’on n’arrive pas à bien prédire, dans aucun cas?

On aimerait exclure ou en tout cas identifier dans ces graphiques s’il y a des gènes que l’on prédit mal quel que soit alpha.

En fait les gènes sur lesquels on se trompe le plus sont aussi les gènes les plus exprimés… Donc on va normaliser la MSE par la variance du gène cible pour avoir des MSE comparables entre gènes.

mean_expr <- rowMeans(counts)
var_expr <- matrixStats::rowSds(counts)*matrixStats::rowSds(counts)

all_mses <- cbind(lmses, lmses_lasso)
#all_mses_scaled <- (all_mses-rowMeans(all_mses)) / matrixStats::rowSds(as.matrix(all_mses))
all_mses_scaled <- exp(all_mses) / var_expr

min_mses <- matrixStats::rowMins(as.matrix(all_mses_scaled))

min_mses_lasso <- matrixStats::rowMins(as.matrix(exp(lmses_lasso)/var_expr))
min_mses_rf <- matrixStats::rowMins(as.matrix(exp(lmses)/var_expr))

hist(min_mses_lasso, breaks = 100)

hist(min_mses_rf, breaks = 100)

hist(min_mses, breaks = 100)

# df <- data.frame(mean_expr)
bad_genes_lasso <- genes[which(min_mses_lasso > 0.3)]
bad_genes_rf <- genes[which(min_mses_rf > 0.3)]


bad_genes_lasso <- min_mses_lasso > 0.3
bad_genes_rf <- min_mses_rf > 0.3

# df$bad <- rownames(df) %in% bad_genes
# df$min_lmse <- min_mses
# ggplot(df, aes(x=bad, y=mean_expr)) + geom_boxplot()
# ggplot(df, aes(x=log(mean_expr), y=min_lmse)) + geom_point()
# 
mse_lasso_scaled <- exp(lmses_lasso)/var_expr
mse_rf_scaled <- exp(lmses)/var_expr


# mse_lasso_scaled[bad_genes_lasso>0.3,]<-NA
# mse_rf_scaled[bad_genes_rf>0.3,]<-NA
# 

# 
# ha = HeatmapAnnotation(
#     alpha = anno_simple(as.numeric(rep(colnames(mse),1))),
#     annotation_name_side = "left")
# 
# Heatmap(mse_lasso_scaled, row_km = 0, cluster_columns = F, show_row_names = F, 
#         name = "scaled logMSE (z-score)", top_annotation = ha, 
#         row_title = "Genes", column_title = "alpha") + rowAnnotation(
#     scaled_min_mse = min_mses_lasso)
# 
# Heatmap(mse_rf_scaled, row_km = 0, cluster_columns = F, show_row_names = F, 
#         name = "scaled logMSE (z-score)", top_annotation = ha,
#         row_title = "Genes", column_title = "alpha")+ rowAnnotation(
#     scaled_min_mse = min_mses_rf)

Sans normaliser par la MSE moyenne par gène, on ne voit plus les variations de MSE liées à alpha, on voit uniquement les gènes qui sont globalement bien ou mal prédits. Deux méthodes simultanément :

Heatmap(log(all_mses_scaled), row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, col = rev(hcl.colors(5, palette = "Reds 2")),
        row_title = "Genes")+ rowAnnotation(
    scaled_min_mse = min_mses)

Je remets donc un z-score sur la MSE normalisée par gène englobant les deux méthodes, pour visualiser plus précisément les variations de MSE suivant alpha:

Heatmap((all_mses_scaled-rowMeans(all_mses_scaled))/matrixStats::rowSds(as.matrix(all_mses_scaled)), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall,
        row_title = "Genes")+ rowAnnotation(
    scaled_min_mse = min_mses)

En fait, comme le z-score est calculé sur les deux méthodes, on a encore une grosse contribution à la couleur de la différence de MSE inhérente aux deux méthodes, et non à l’effet de l’intégration. Je calcule donc un z-score par méthodes pour faire la représentation:

mse_lasso_scaled_zscore <- (mse_lasso_scaled-rowMeans(mse_lasso_scaled, na.rm = T))/matrixStats::rowSds(as.matrix(mse_lasso_scaled, na.rm=T))
mse_rf_scaled_zscore <- (mse_rf_scaled-rowMeans(mse_rf_scaled))/matrixStats::rowSds(as.matrix(mse_rf_scaled))

col_fun = circlize::colorRamp2(c(-0.8, 0, 0.3), c("blue", "white", "red"))


Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    scaled_min_mse = min_mses_lasso-min_mses_rf, col = list(scaled_min_mse = col_fun))

Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    bad_for_lasso = bad_genes_lasso,
    bad_for_rf = bad_genes_rf)

c’est informatif : certains gènes bénéficient de l’intégration de données et d’autres non, et ce n’est pas spécialement lié à si on les prédit bien globalement. Par contre celà dépend visiblement de la méthode utilisée et nous n’avons pas encore d’explication pour ça…

Faire des clusters informatifs et clairs pour notre question

Définir des groupes suivant l’optimum comme par exemple:

get_optimum <- function(gene, method = "rf"){
  if(method=="rf")
    return(as.numeric(names(which.min(mse_rf_scaled_zscore[gene,]))))
  if(method=="lasso")
    return(as.numeric(names(which.min(mse_lasso_scaled_zscore[gene,]))))
}

assign_cluster <- function(opt){
  if(opt == 0)
    return("0")
  if(opt>0 & opt <=0.4){
    return("0.1-0.4")
  }
  if(opt>0.4 & opt <=0.7){
    return("0.5-0.7")
  }
  if(opt>0.7 & opt <=1){
    return("0.8-1")
  }
}
clusters_rf <- sapply(sapply(genes, get_optimum, method = "rf"), assign_cluster)
clusters_lasso <- sapply(sapply(genes, get_optimum, method = "lasso"), assign_cluster)
table(clusters_lasso)
## clusters_lasso
##       0 0.1-0.4 0.5-0.7   0.8-1 
##     635     474     197     120
table(clusters_rf)
## clusters_rf
##       0 0.1-0.4 0.5-0.7   0.8-1 
##     317     579     268     262
Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore), 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    clusters_rf = clusters_rf,
    clusters_lasso = clusters_lasso, 
    col=list(clusters_lasso = setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))),
             clusters_rf= setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))) ))

draw_specific_genes <- function(genes){
  Heatmap(cbind(mse_rf_scaled_zscore, mse_lasso_scaled_zscore)[genes,], 
        row_km = 0, cluster_columns = F, show_row_names = F, 
        name = "MSE/Var", top_annotation = hall, 
        row_title = "Genes")+ rowAnnotation(
    clusters_rf = clusters_rf[genes],
    clusters_lasso = clusters_lasso[genes], 
    col=list(clusters_lasso = setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))),
             clusters_rf= setNames(c("red","orange", "yellow", "green"), 
                                       nm = names(table(clusters_lasso))) ))
}
all_pwm_pos <- names(which(clusters_lasso == "0.8-1" & clusters_rf == "0.8-1"))
lasso_bad_rf_good <- names(which(clusters_lasso == "0" & clusters_rf == "0.8-1"))
lasso_good_rf_bad <- names(which(clusters_lasso == "0.8-1" & clusters_rf == "0"))

all_yellow <-  names(which(clusters_lasso == "0.5-0.7" & clusters_rf == "0.5-0.7"))
all_orange <-  names(which(clusters_lasso == "0.1-0.4" & clusters_rf == "0.1-0.4"))
all_red <-  names(which(clusters_lasso == "0" & clusters_rf == "0"))

different <- names(which(clusters_lasso != clusters_rf))
draw_specific_genes(all_pwm_pos)

draw_specific_genes(c(lasso_bad_rf_good, lasso_good_rf_bad))

draw_specific_genes(all_yellow)

draw_specific_genes(all_orange)

draw_specific_genes(all_red)

draw_specific_genes(different)

Quelles sont les caractéristiques des clusters de gènes

Nombre de motifs dans le promoteur, niveau d’expression, ontologies, sont-ils des TFs? leur taille?

Quelle est la précision rappel des sous réseaux concernant ces gènes.

load("rdata/pwm_prom_jaspar_dap.rdata")

genes_info <- data.frame(genes = genes, 
                         cluster_lasso = clusters_lasso[genes], 
                         cluster_rf = clusters_rf[genes])
genes_info$is_tf <- genes %in% tfs
genes_info$var <- var_expr
genes_info$expr <- mean_expr
genes_info$nb_motifs <- table(pwm_prom$target)[genes]

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
ggplot(aes(x=cluster_rf, y=nb_motifs, 
                       fill = cluster_rf==cluster_lasso)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
  ggtitle(("Number of motifs in promoter"))
## Don't know how to automatically pick scale for object of type <table>.
## Defaulting to continuous.

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
  ggplot(aes(x=cluster_rf, y=var, 
                       fill = cluster_rf==cluster_lasso)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
  ggtitle(("Gene variance for all groups")) + scale_y_log10()

genes_info%>%
  ggplot(aes(x=cluster_rf, y=var)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for RF groups")) + scale_y_log10()+
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=cluster_lasso, y=var)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene variance for LASSO groups")) + scale_y_log10()+
  stat_compare_means()

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
  ggplot(aes(x=cluster_rf, y=expr, 
             fill = cluster_rf==cluster_lasso)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  facet_wrap(~cluster_lasso_label )+ xlab("cluster RF") +
  ggtitle(("Gene expression for all groups")) + scale_y_log10()

genes_info%>%
  mutate(cluster_lasso_label=paste("lasso :", cluster_lasso))%>%
  mutate(agree_on_beneficial_pwms = cluster_lasso == "0.8-1" & cluster_lasso==cluster_rf) %>%
  ggplot(aes(x=agree_on_beneficial_pwms, y=expr, 
             fill = agree_on_beneficial_pwms)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  xlab("cluster RF") +
  ggtitle(("Gene expression : PWM beneficial for both methods vs other groups")) + scale_y_log10()

genes_info%>%
  ggplot(aes(x=cluster_rf, y=expr)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for RF groups")) + scale_y_log10()+
  stat_compare_means()

genes_info%>%
  ggplot(aes(x=cluster_lasso, y=var)) + 
  geom_violin()+geom_jitter(width=0.1)+
  geom_boxplot(width=0.1, fill = "white")+
  ggtitle(("Gene expression for LASSO groups")) + scale_y_log10()+
  stat_compare_means()

clusters_info <- genes_info %>%
  group_by(cluster_lasso, cluster_rf) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n())
## `summarise()` has grouped output by 'cluster_lasso'. You can override using the
## `.groups` argument.
ggplot(clusters_info, aes(x=cluster_rf, y=tf_frac, 
                          label = paste("n=",n),
                          fill = cluster_rf==cluster_lasso)) + 
  geom_bar(stat = "identity")+
  facet_wrap(~cluster_lasso )+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.22) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in all groups")

genes_info %>%
  group_by(cluster_lasso) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_lasso, y=tf_frac, 
                          label = paste("n=",n))) + 
  geom_bar(stat = "identity")+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.16) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in LASSO groups")

genes_info %>%
  group_by(cluster_rf) %>%
  summarise(n=n(), 
            tf_frac=sum(is_tf)/n()) %>%
  ggplot(aes(x=cluster_rf, y=tf_frac, 
                          label = paste("n=",n))) + 
  geom_bar(stat = "identity")+
  geom_hline(yintercept = length(tfs)/length(genes)) + 
  geom_text(y=0.15) + xlab("cluster RF") +
  ggtitle("Fraction of TFs in RF groups")

Est-ce qu’on a significativement plus de TFs dans les genes qui sont bien prédits avec les PWM chez le lasso?

fisher.test(matrix(c(length(genes), length(tfs), 120, 120*0.175), 
                   nrow = 2), alternative = "greater")
## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(length(genes), length(tfs), 120, 120 * 0.175), nrow = 2)
## p-value = 0.2257
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.7895277       Inf
## sample estimates:
## odds ratio 
##   1.241379